import plotly
import matplotlib.pyplot as plt
import pandas as pd
import os
import plotly.io as pio
import plotly.graph_objects as go
Take the Gapminder Test: http://forms.gapminder.org/s3/test-2018. What score did you receive? Did any of the answers surprise you? Choose a question from the test, re-state it, and answer it using visualization and summarization. Provide a figure and any relevant output with your answer.
I scored 46% on the test, and many of the answers surprised me.
I was surprised by world life expectancy: I guessed 60 years, but it is actually 70.
Re-state the question:
Determine how life expectancy compares between men and women over time.
Insights: Women generally have a higher life expectancy than men, although the gap is narrowing over time.
# Load the male and female life expectancy tables (one row per geo and time).
male = pd.read_csv(r"data/ddf--gapminder--systema_globalis-master/ddf--datapoints--life_expectancy_male--by--geo--time.csv")
female = pd.read_csv(r"data/ddf--gapminder--systema_globalis-master/ddf--datapoints--life_expectancy_female--by--geo--time.csv")

# Keep only the rows whose geo code is "usa".
male = male[male["geo"] == "usa"]
female = female[female["geo"] == "usa"]

# One bar trace per sex on a shared figure.
fig = go.Figure()

x, y = male["time"], male["life_expectancy_male"]
fig.add_trace(go.Bar(x=x, y=y, name="Male"))

x, y = female["time"], female["life_expectancy_female"]
fig.add_trace(go.Bar(x=x, y=y, name="Female"))

# Edit the layout
fig.update_layout(
    title='<b>Gapminder</b><br>Man Vs Female life expectancy',
    xaxis_title='Year',
    yaxis_title='Life Expectancy',
)
fig.show()
Visualize the distribution of income (GDP / capita) across countries and continents, and how the distribution of income changes over time. Interpret the visualization and what you notice. Are there any notable trends and/or deviations from that trend? What caveats apply to your conclusions?
def get_datatset(col1, file1, file2):
    """Build region-level and country-level yearly averages of one indicator.

    Args:
        col1: Name of the indicator column in ``file1`` to average.
        file1: Path to a CSV of datapoints with ``geo``, ``time`` and
            ``col1`` columns.
        file2: Path to a CSV of countries with ``country`` and
            ``world_4region`` columns.

    Returns:
        A pair ``(by_region, by_country)``: the mean of ``col1`` per
        (``world_4region``, ``time``), and the mean of ``col1`` per
        (``geo``, ``time``) joined back to its ``country`` and
        ``world_4region`` columns.
    """
    datapoints = pd.read_csv(file1)
    regions = pd.read_csv(file2)[['country', 'world_4region']]

    # Inner join: datapoints whose geo has no matching country row are dropped.
    with_region = datapoints.merge(regions, left_on='geo', right_on='country')

    by_region = (
        with_region
        .groupby(["world_4region", "time"])[col1]
        .mean()
        .reset_index()
    )
    by_country = (
        with_region
        .groupby(["geo", "time"])[col1]
        .mean()
        .reset_index()
    )

    # Re-attach the region label that the per-country groupby discarded.
    by_country_with_region = by_country.merge(
        regions, left_on='geo', right_on='country'
    )
    return by_region, by_country_with_region
# Indicator column to average, followed by the datapoints CSV and the
# country-entities CSV that get_datatset reads.
col1 = "income_per_person_gdppercapita_ppp_inflation_adjusted"
file1 = r"data/ddf--gapminder--systema_globalis-master/ddf--datapoints--income_per_person_gdppercapita_ppp_inflation_adjusted--by--geo--time.csv"
file2 = r'data/ddf--gapminder--systema_globalis-master/ddf--entities--geo--country.csv'

# First result: per-region yearly averages; second: per-country yearly
# averages with the region label attached.
gdp_continent_avg_df, gdp_country_avg_df = get_datatset(
    col1=col1, file1=file1, file2=file2
)
def plot(df, x1, x2, x3, t):
    """Show an interactive line chart of ``df[x2]`` against ``df[x1]``.

    The figure is assembled as a plain dict and displayed with
    ``pio.show(..., validate=False)``. A ``groupby`` transform keyed on
    ``df[x3]`` splits the single trace into one group per distinct value,
    and the y axis uses a logarithmic scale.

    Args:
        df: DataFrame containing the columns named by ``x1``, ``x2``, ``x3``.
        x1: Column plotted on the x axis; also used as the x-axis title.
        x2: Column plotted on the y axis; also used as the y-axis title.
        x3: Column used for the hover text and as the grouping key.
        t: Figure title.

    Returns:
        None. The figure is displayed as a side effect.
    """
    # NOTE(review): groupby transforms are reported as deprecated in recent
    # plotly releases -- confirm against the installed version.
    data = [dict(
        type='line',
        x=df[x1],
        y=df[x2],
        text=df[x3],
        hoverinfo='text',
        opacity=0.8,
        transforms=[
            dict(
                type='groupby',
                groups=df[x3],
            )],
    )]
    layout = dict(
        title=t,
        xaxis=dict(
            title=x1,
        ),
        yaxis=dict(
            title=x2,
            type='log',
        ),
    )
    fig_dict = dict(data=data, layout=layout)
    # The figure is described entirely by fig_dict; no module-level figure
    # object is read or modified here.
    pio.show(fig_dict, validate=False)
# Dataset files whose name contains "gdp" (exploratory listing).
data = [
    fname
    for fname in os.listdir(r"data/ddf--gapminder--systema_globalis-master")
    if "gdp" in fname
]
# Distinct region labels in the per-country table; the value is only
# displayed when this runs as a notebook cell.
set(gdp_country_avg_df["world_4region"])

# Every chart below plots the same indicator against time.
x1 = 'time'
x2 = 'income_per_person_gdppercapita_ppp_inflation_adjusted'

# Region-level charts: one line per world_4region value.
x3 = 'world_4region'

title = '<b>Gapminder</b><br>Average GDP Per Continent'
plot(gdp_continent_avg_df, x1, x2, x3, title)

title = '<b>Gapminder</b><br>Average GDP Per Continent after the year 2000'
df = gdp_continent_avg_df[gdp_continent_avg_df["time"] > 2000]
plot(df, x1, x2, x3, title)

# Country-level charts: one line per geo code.
x3 = 'geo'

title = '<b>Gapminder</b><br>Average GDP Per Country'
plot(gdp_country_avg_df, x1, x2, x3, title)

# Same chart restricted to rows whose region is "africa".
df = gdp_country_avg_df[gdp_country_avg_df["world_4region"] == "africa"]
title = '<b>Gapminder</b><br>Average GDP per country in Africa'
plot(df, x1, x2, x3, title)
Use visualization to investigate the relationship between income (GDP / capita), life expectancy, and child mortality over time. How does each measure change over time within each continent? Interpret your visualizations, noting any trends and/or outliers.
# Input files for this section.
# NOTE(review): population_file carries an extra
# "dataset/ddf--gapminder--systema_globalis-master/" segment that the other
# paths do not have; neither it nor gdp_file is read in this section -- confirm.
population_file = r"data/ddf--gapminder--systema_globalis-master/dataset/ddf--gapminder--systema_globalis-master/ddf--datapoints--population_total--by--geo--time.csv"
gdp_file = r"data/ddf--gapminder--systema_globalis-master/ddf--datapoints--income_per_person_gdppercapita_ppp_inflation_adjusted--by--geo--time.csv"
country_file = r"data/ddf--gapminder--systema_globalis-master/ddf--entities--geo--country.csv"
lifeexpectancy_file = r"data/ddf--gapminder--systema_globalis-master/ddf--datapoints--life_expectancy_years--by--geo--time.csv"
child_mort_file = r"data/ddf--gapminder--systema_globalis-master/ddf--datapoints--child_mortality_0_5_year_olds_dying_per_1000_born--by--geo--time.csv"

lifeexpectancy_df = pd.read_csv(lifeexpectancy_file)
child_mort_df = pd.read_csv(child_mort_file)
country_df = pd.read_csv(country_file)
child_mort_df

# Join the two indicator tables on their shared columns, then attach each
# country's region label.
df_life_exp_child_mor = (
    lifeexpectancy_df
    .merge(child_mort_df)
    .merge(
        country_df[["country", "world_4region"]],
        left_on='geo',
        right_on='country',
    )
)

# All three charts below are per-region averages over time.
x1 = 'time'
x3 = 'world_4region'

# Average life expectancy per region and year.
x2 = 'life_expectancy_years'
df_life_exp_child_mor_df = (
    df_life_exp_child_mor.groupby([x3, x1])[x2].mean().reset_index()
)
df_life_exp_child_mor_df.head()
df = df_life_exp_child_mor_df
title = '<b>Gapminder</b><br>Average life expectancy per continent'
plot(df, x1, x2, x3, title)

# Average child mortality per region and year.
x2 = 'child_mortality_0_5_year_olds_dying_per_1000_born'
df_life_exp_child_mor_df = (
    df_life_exp_child_mor.groupby([x3, x1])[x2].mean().reset_index()
)
df_life_exp_child_mor_df.head()
df = df_life_exp_child_mor_df
title = '<b>Gapminder</b><br>Average child mortality per continent'
plot(df, x1, x2, x3, title)

# Average income per region and year, reusing the table computed earlier.
x2 = 'income_per_person_gdppercapita_ppp_inflation_adjusted'
title = '<b>Gapminder</b><br>Average GDP Per Continent'
plot(gdp_continent_avg_df, x1, x2, x3, title)
Choose two variables you have not investigated yet, and visualize their distributions, their relationship with each other, and how these change over time. Interpret your visualizations, noting any trends and/or outliers.
I chose the population dataset and the unemployment rate for people aged 15 to 24, and compared the unemployment trend between India and the USA.
Because India's population is much larger, its unemployment count has risen steeply, whereas unemployment in the USA shows a periodic up-and-down pattern over the years.
# Total population and the unemployment rate (percent) for ages 15-24,
# both keyed by geo and time.
population_file = r"data/ddf--gapminder--systema_globalis-master/ddf--datapoints--population_total--by--geo--time.csv"
unemploy_file = r"data/ddf--gapminder--systema_globalis-master/ddf--datapoints--aged_15_24_unemployment_rate_percent--by--geo--time.csv"
population_df = pd.read_csv(population_file)
unemploy_df = pd.read_csv(unemploy_file)

# Inner join on the columns the two tables have in common.
unempl_pop_df = population_df.merge(unemploy_df)

fig = go.Figure()

# The rate column is a percentage, so it is divided by 100 before being
# scaled by population; multiplying the raw percent would inflate the
# plotted count a hundredfold.
# NOTE(review): the rate is for ages 15-24 while population_total presumably
# covers all ages, so this is still an upper-bound proxy rather than a true
# head count -- confirm which population series should be used.
unempl_pop_ind_df = unempl_pop_df.loc[unempl_pop_df["geo"] == "ind"]
x = unempl_pop_ind_df["time"]
y = (
    unempl_pop_ind_df["aged_15_24_unemployment_rate_percent"] / 100
    * unempl_pop_ind_df["population_total"]
)
fig.add_trace(
    go.Scatter(
        x=x,
        y=y,
        name="Unemployment count in India"))

unempl_pop_usa_df = unempl_pop_df.loc[unempl_pop_df["geo"] == "usa"]
x = unempl_pop_usa_df["time"]
y = (
    unempl_pop_usa_df["aged_15_24_unemployment_rate_percent"] / 100
    * unempl_pop_usa_df["population_total"]
)
fig.add_trace(
    go.Scatter(
        x=x,
        y=y,
        name="Unemployment count in USA"))

# Edit the layout
fig.update_layout(title='<b>Gapminder</b><br>Comparison of unemployment rate between India Vs USA',
                  xaxis_title='Year',
                  yaxis_title='unemployment')
fig.show()
Did you use static or interactive plots to answer the previous problems? Explore the data using the interactive visualization tools at https://www.gapminder.org/tools, and watch the TED talk “The best stats you’ve ever seen” at https://www.youtube.com/watch?v=hVimVzgtD6w. Discuss the advantages, disadvantages, and relative usefulness of using interactive/dynamic visualizations versus static visualizations.
I used interactive plots
Advantages of interactive plots / disadvantages of static plots:
- The plots contain a lot of information and keep viewers engaged.
- They give the viewer the ability to focus on several aspects of a plot.
- They are useful when a lot of information has to be conveyed.
Disadvantages of interactive plots / advantages of static plots:
- More difficult to make and takes more time.
- Static plots are suitable when the information to be conveyed is simple in nature.